DDSP reconstruction analysis on the Shtooka 4876 dataset

In [35]:
# Initialize Notebook
from IPython.core.display import HTML,Image
#%run ../library/v1.0.5/init.ipy
# Inject a JS snippet that hides all code cells by default and adds a
# "Toggle Code" button (affects the rendered HTML view only, not execution).
HTML('''<script> code_show=true;  function code_toggle() {  if (code_show){  $('div.input').hide();  } else {  $('div.input').show();  }  code_show = !code_show }  $( document ).ready(code_toggle); </script> <form action="javascript:code_toggle()"><input type="submit" value="Toggle Code"></form>''')
Out[35]:
In [1]:
#import tensorflow as tf
#print (tf.__version__)
import re
#from tensorflow.examples.tutorials.mnist import input_data
from matplotlib import offsetbox
import gc, argparse, sys, os, errno
%pylab inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#sns.set()
#sns.set_style('whitegrid')
import h5py
from PIL import Image
import os
from tqdm import tqdm_notebook as tqdm
import scipy
import sklearn
from scipy.stats import pearsonr
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from scipy.io import loadmat
import IPython.display as ipd
import IPython
import librosa.display
import librosa
Populating the interactive namespace from numpy and matplotlib
/scratch/xc1490/anaconda3/lib/python3.7/site-packages/statsmodels/tools/_testing.py:19: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.
  import pandas.util.testing as tm
In [2]:
def audio_stft(audio, n_fft=64):
    """Compute the dB-scaled STFT magnitude of a waveform.

    Parameters
    ----------
    audio : np.ndarray
        1-D array of waveform samples.
    n_fft : int, optional
        FFT window size (default 64).

    Returns
    -------
    np.ndarray
        dB-scaled magnitude spectrogram, shape (1 + n_fft//2, n_frames).
    """
    # Bug fix: the original hard-coded n_fft=64 inside librosa.stft,
    # silently ignoring the n_fft argument.
    X = librosa.stft(audio, n_fft=n_fft)
    return librosa.amplitude_to_db(np.abs(X))
In [3]:
cd ..
/scratch/xc1490/projects/ecog
In [4]:
ls data/shtooka/wav4876/reconstruct_wave/ 
ls: cannot access data/shtooka/wav4876/reconstruct_wave/: No such file or directory
In [5]:
# Paths to DDSP reconstruction outputs and dataset-size constants.
trainpath = 'data/ddsp/audio/recon/withz/train/'
testpath = 'data/ddsp/audio/recon/withz/test/'
trainfilenum = 160
testfilenum = 150
# Dimensions of the audio_stft output: 33 frequency bins (= 1 + 64/2 for
# n_fft=64) by 4001 frames (64000 samples at the default hop of n_fft/4 = 16).
# NOTE(review): despite the name, nmels counts STFT bins, not mel bands.
nmels = 33
length = 4001
In [6]:
# Pre-allocate arrays for ground-truth / reconstructed audio and spectrograms.
# np.empty is the intended API for uninitialized allocation (calling
# np.ndarray(...) directly is a low-level constructor and discouraged);
# every row is overwritten in the loop below.
train_audio = np.empty([trainfilenum, 64000])
train_audio_recon = np.empty([trainfilenum, 64000])
train_spec = np.empty([trainfilenum, 334, 385])
train_spec_recon = np.empty([trainfilenum, 334, 385])
melspec_train = np.empty([trainfilenum, nmels, length])
melspec_train_recon = np.empty([trainfilenum, nmels, length])
for i in tqdm(range(trainfilenum)):
    # librosa returns (waveform, sr); keep only the waveform, resampled to 16 kHz.
    train_audio[i] = librosa.core.load(trainpath+'audio_'+str(i)+'.wav', sr=16000)[0]
    # NOTE(review): reconstructed audio files have no underscore before the
    # index ('recon_audio0.wav') while the spec files do — confirm on disk.
    train_audio_recon[i] = librosa.core.load(trainpath+'recon_audio'+str(i)+'.wav', sr=16000)[0]
    train_spec[i] = np.load(trainpath+'spec_'+str(i)+'.npy')
    train_spec_recon[i] = np.load(trainpath+'recon_spec_'+str(i)+'.npy')
    # Recompute dB-scaled STFT magnitudes directly from the waveforms.
    melspec_train[i] = audio_stft(train_audio[i])
    melspec_train_recon[i] = audio_stft(train_audio_recon[i])

In [7]:
# Same as the train-set loader: pre-allocate with np.empty (the explicit
# idiom for uninitialized allocation) and fill every row in the loop.
test_audio = np.empty([testfilenum, 64000])
test_audio_recon = np.empty([testfilenum, 64000])
test_spec = np.empty([testfilenum, 334, 385])
test_spec_recon = np.empty([testfilenum, 334, 385])
melspec_test = np.empty([testfilenum, nmels, length])
melspec_test_recon = np.empty([testfilenum, nmels, length])
for i in tqdm(range(testfilenum)):
    # librosa returns (waveform, sr); keep only the waveform, resampled to 16 kHz.
    test_audio[i] = librosa.core.load(testpath+'audio_'+str(i)+'.wav', sr=16000)[0]
    test_audio_recon[i] = librosa.core.load(testpath+'recon_audio'+str(i)+'.wav', sr=16000)[0]
    test_spec[i] = np.load(testpath+'spec_'+str(i)+'.npy')
    test_spec_recon[i] = np.load(testpath+'recon_spec_'+str(i)+'.npy')
    # Recompute dB-scaled STFT magnitudes directly from the waveforms.
    melspec_test[i] = audio_stft(test_audio[i])
    melspec_test_recon[i] = audio_stft(test_audio_recon[i])

Siamese VAE reconstruction results

In [8]:
def MSE_pcc(A, B, ax=None):
    """Return (variance-normalized MSE, Pearson correlation) between A and B.

    A is the prediction and B the reference; the squared error is divided by
    B's variance so the MSE is comparable across samples of different scale.
    `ax` is accepted for interface compatibility but unused.
    """
    squared_err = (A - B) ** 2
    normalized_mse = np.mean(squared_err / B.var())
    corr, _pvalue = pearsonr(A.ravel(), B.ravel())
    return normalized_mse, corr
def analyze(predict, GT_STFT_test_spkr):
    """Compute per-sample MSE/PCC against ground truth and plot histograms.

    Iterates over the first axis of `predict`, scoring each sample with
    MSE_pcc, then draws side-by-side histograms whose titles show
    mean(std) of each metric. Returns the (mse, pcc) arrays.
    """
    n_samples = predict.shape[0]
    mse = np.zeros(n_samples)
    pcc = np.zeros(n_samples)
    for idx in tqdm(range(n_samples)):
        mse[idx], pcc[idx] = MSE_pcc(predict[idx], GT_STFT_test_spkr[idx])
    fig, axes = plt.subplots(1, 2, figsize=(16, 4))
    axes[0].hist(mse, bins=25, color='b')
    axes[0].set_title('MSE: %g(%g)' % (np.round(mse.mean(), 3), np.round(mse.std(), 3)))
    axes[1].hist(pcc, bins=50, color='g')
    axes[1].set_title('PCC: %g(%g)' % (np.round(pcc.mean(), 3), np.round(pcc.std(), 3)))
    return mse, pcc
In [9]:
# Train-set reconstruction quality on the spectrograms saved by the model.
GT_STFT_train_spkr, predict_train = train_spec,train_spec_recon
mse,pcc=analyze(predict_train,GT_STFT_train_spkr)

In [10]:
# Test-set reconstruction quality on the saved spectrograms.
GT_STFT_test_spkr, predict  = test_spec,test_spec_recon
mse,pcc=analyze(predict,GT_STFT_test_spkr)

In [11]:
#MSE_pcc(predict , GT_STFT_test_spkr)
# Same metrics, but on STFTs recomputed from the waveforms via audio_stft.
mse,pcc=analyze(melspec_test,melspec_test_recon)

TEST

In [12]:
rownum = 2
columnnum = 8
# Stack ground truth over prediction into one image per sample for
# side-by-side visual comparison.
tmptest = GT_STFT_test_spkr[:rownum *columnnum]
tmppred = predict[:rownum *columnnum]
tmpvisarr = [np.concatenate((tmptest[i].T, tmppred[i].T)) for i in range(tmptest.shape[0])]

fig, ax = plt.subplots(rownum, columnnum, figsize=(columnnum*2, rownum*4))
for i in range(columnnum):
    for j in range(rownum):
        # Use plt.cm explicitly instead of the bare `cm` that only exists
        # because of the %pylab star import.
        ax[j, i].imshow(tmpvisarr[columnnum*j+i], cmap=plt.cm.Blues)
plt.tight_layout()
In [13]:
rownum = 2
columnnum = 8
fig, ax = plt.subplots(rownum*2, columnnum, figsize=(columnnum*2, rownum*4))
for i in range(columnnum):
    for j in range(rownum):
        # Alternate rows: ground truth on top, prediction below.
        # Use plt.cm explicitly instead of the bare `cm` from %pylab.
        ax[j*2, i].imshow(GT_STFT_test_spkr[columnnum*j+i].T, cmap=plt.cm.Blues)
        ax[j*2+1, i].imshow(predict[columnnum*j+i].T, cmap=plt.cm.Blues)
plt.tight_layout()

Audio-to-spectrogram comparison (test set)

In [14]:
# Same test-set comparison, drawn from the STFTs recomputed from the
# waveforms, using librosa's specshow (proper time/frequency axes).
rownum = 2
columnnum = 8
fig,ax=plt.subplots(rownum*2,columnnum,figsize=(columnnum*2,rownum *4))
for i in range(columnnum):
    for j in range(rownum):
        # Alternate rows: ground truth on top, reconstruction below.
        librosa.display.specshow(melspec_test[columnnum*j+i],
                          sr=16000,ax= ax[j*2,i])
        librosa.display.specshow(melspec_test_recon[columnnum*j+i],
                          sr=16000,ax= ax[j*2+1,i])
plt.tight_layout()

TRAIN

In [15]:
rownum = 2
columnnum = 8
fig, ax = plt.subplots(rownum*2, columnnum, figsize=(columnnum*2, rownum*4))
for i in range(columnnum):
    for j in range(rownum):
        # Alternate rows: ground truth on top, prediction below.
        # Use plt.cm explicitly instead of the bare `cm` from %pylab.
        ax[j*2, i].imshow(GT_STFT_train_spkr[columnnum*j+i].T, cmap=plt.cm.Blues)
        ax[j*2+1, i].imshow(predict_train[columnnum*j+i].T, cmap=plt.cm.Blues)
plt.tight_layout()
In [16]:
rownum = 2
columnnum = 8
fig, ax = plt.subplots(rownum*2, columnnum, figsize=(columnnum*2, rownum*4))
for i in range(columnnum):
    for j in range(rownum):
        # Top row of each pair: ground truth; bottom row: reconstruction.
        librosa.display.specshow(GT_STFT_train_spkr[columnnum*j+i].T,
                          sr=16000, ax=ax[j*2, i])
        # Bug fix: the original plotted GT_STFT_train_spkr in BOTH rows;
        # the second panel should show the reconstruction (predict_train).
        librosa.display.specshow(predict_train[columnnum*j+i].T,
                          sr=16000, ax=ax[j*2+1, i])
plt.tight_layout()

Audio playback: ground truth vs. DDSP reconstruction

In [17]:
# Waveform plots and inline audio players for one test example (index 2):
# ground truth vs. DDSP reconstruction.
# NOTE(review): this cell is duplicated below for indices 10 and 21 — a
# shared plotting function would remove the copy-paste.
ind = 2
fig,ax=plt.subplots(2,1,figsize=(25,18))
librosa.display.waveplot(test_audio[ind], sr=16000,ax=ax[0])
librosa.display.waveplot(test_audio_recon[ind], sr=16000,ax=ax[1])
ax[0].set_title('Ground Truth Audio',fontsize=22)
ax[1].set_title('DDSP Generated Audio',fontsize=22)
# Fixed y-limits so the two waveforms are directly comparable.
ax[0].set_ylim(-0.8,0.8)
ax[1].set_ylim(-0.8,0.8)
fig.tight_layout()
display(ipd.Audio(test_audio[ind],rate=16000))
display(ipd.Audio(test_audio_recon[ind],rate=16000))
In [18]:
# Same comparison for test example index 10.
ind = 10
fig,ax=plt.subplots(2,1,figsize=(25,18))
librosa.display.waveplot(test_audio[ind], sr=16000,ax=ax[0])
librosa.display.waveplot(test_audio_recon[ind], sr=16000,ax=ax[1])
ax[0].set_title('Ground Truth Audio',fontsize=22)
ax[1].set_title('DDSP Generated Audio',fontsize=22)
# Fixed y-limits so the two waveforms are directly comparable.
ax[0].set_ylim(-0.8,0.8)
ax[1].set_ylim(-0.8,0.8)
fig.tight_layout()
display(ipd.Audio(test_audio[ind],rate=16000))
display(ipd.Audio(test_audio_recon[ind],rate=16000))
In [19]:
# Same comparison for test example index 21.
ind = 21
fig,ax=plt.subplots(2,1,figsize=(25,18))
librosa.display.waveplot(test_audio[ind], sr=16000,ax=ax[0])
librosa.display.waveplot(test_audio_recon[ind], sr=16000,ax=ax[1])
ax[0].set_title('Ground Truth Audio',fontsize=22)
ax[1].set_title('DDSP Generated Audio',fontsize=22)
# Fixed y-limits so the two waveforms are directly comparable.
ax[0].set_ylim(-0.8,0.8)
ax[1].set_ylim(-0.8,0.8)
fig.tight_layout()
display(ipd.Audio(test_audio[ind],rate=16000))
display(ipd.Audio(test_audio_recon[ind],rate=16000))
In [ ]:
 
In [ ]: